These are the libraries to load
# Packages required for this analysis; uncomment the install line on first run
pkg <- c('pivottabler', 'tidyverse','tibbletime','anomalize','timetk')
#install.packages(pkg)
library(pivottabler)  # pivot tables (PivotTable$new) for channel/year spend
library(lubridate)    # date helpers: year(), month()
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)        # data-frame verbs: filter(), mutate(), select()
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)      # plotting (ggplot, geom_line)
library(tidyverse)    # loads the remaining core tidyverse packages
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ✔ readr 2.1.4 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tibbletime)   # tbl_time objects used by the anomalize pipeline
##
## Attaching package: 'tibbletime'
##
## The following object is masked from 'package:stats':
##
## filter
library(anomalize)    # time_decompose() / anomalize() / time_recompose()
library(timetk)       # plot_anomaly_diagnostics()
library(forecast)     # autoplot() on ts objects — NOTE(review): not in pkg above
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
Read the revenue and AdSpend data
We check the type of the ‘date’ column and convert it to the Date class
The CSV data is available in the GitHub repository
# The working directory is not actually used: both CSVs are read straight
# from GitHub. A hard-coded, user-specific path breaks the script on any
# other machine, so it is left commented out.
# setwd("/Users/gerardogandara/Documents/recast/")

## Read both files as CSV from the public repository
df_spend <- read.csv('https://raw.githubusercontent.com/ggandara13/myrepository/master/acme_spend.csv')
df_revenue <- read.csv('https://raw.githubusercontent.com/ggandara13/myrepository/master/acme_revenue.csv')

# Convert the character date column to the Date class
# (source format is month/day/2-digit-year, e.g. "1/31/20")
df_revenue$date <- as.Date(df_revenue$date, format = "%m/%d/%y")
df_spend$date <- as.Date(df_spend$date, format = "%m/%d/%y")

# Check the format and content of each dataframe
str(df_revenue)
## 'data.frame': 1035 obs. of 5 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ date : Date, format: "2020-01-01" "2020-01-02" ...
## $ revenue_dtc : num 16000 19306 22052 23177 26169 ...
## $ revenue_amazon : num 50000 60332 68914 72428 81778 ...
## $ revenue_walmart: num 34000 41026 46861 49251 55609 ...
str(df_spend)
## 'data.frame': 10350 obs. of 4 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ date : Date, format: "2020-01-01" "2020-01-01" ...
## $ channel: chr "facebook_prospecting" "facebook_retargeting" "google_branded_search" "google_nonbranded_search" ...
## $ spend : num 4154 2679 474 3817 7421 ...
Check that there are no NULL (NA) values
# Count missing cells in each table. cat() is used instead of paste():
# paste() only returns the string, so "\n" shows up as a literal escape
# when auto-printed (see the original output); cat() writes the text and
# renders the newline properly.
cat("Total of NULL in Spend:", sum(is.na(df_spend)), "\n")
## Total of NULL in Spend: 69
cat("Total of NULL in Revenue:", sum(is.na(df_revenue)), "\n")
## Total of NULL in Revenue: 5
cat("\nSummary\n")
##
## Summary
# you can check with summary as well and see the NA counts per column
summary(df_spend)
## X date channel spend
## Min. : 1 Min. :2020-01-01 Length:10350 Min. : 0
## 1st Qu.: 2588 1st Qu.:2020-09-15 Class :character 1st Qu.: 1323
## Median : 5176 Median :2021-06-01 Mode :character Median : 5358
## Mean : 5176 Mean :2021-06-01 Mean : 7792
## 3rd Qu.: 7763 3rd Qu.:2022-02-15 3rd Qu.:11711
## Max. :10350 Max. :2022-10-31 Max. :36401
## NA's :69
summary(df_revenue)
## X date revenue_dtc revenue_amazon
## Min. : 1.0 Min. :2020-01-01 Min. : 16000 Min. : 50000
## 1st Qu.: 259.5 1st Qu.:2020-09-15 1st Qu.: 41685 1st Qu.:130464
## Median : 518.0 Median :2021-06-01 Median : 72121 Median :225380
## Mean : 518.0 Mean :2021-06-01 Mean : 72882 Mean :227502
## 3rd Qu.: 776.5 3rd Qu.:2022-02-14 3rd Qu.: 97433 3rd Qu.:303896
## Max. :1035.0 Max. :2022-10-31 Max. :193005 Max. :415952
## NA's :2 NA's :2
## revenue_walmart
## Min. : 34000
## 1st Qu.: 88614
## Median :153184
## Mean :154638
## 3rd Qu.:206464
## Max. :282848
## NA's :1
# Missing cells as a percentage of rows — well under 1% in both tables
sum(is.na(df_spend))/nrow(df_spend)*100
## [1] 0.6666667
sum(is.na(df_revenue))/nrow(df_revenue)*100
## [1] 0.4830918
Since we have 1,035 rows, the missing values represent less than 1%, so we could either drop those rows or replace them with zero
Because this is a time series, let’s replace them with zero to keep the daily index complete
# Impute every remaining missing cell with zero so the daily series stays
# complete (dropping rows would leave gaps in the time index).
df_spend <- replace(df_spend, is.na(df_spend), 0)
df_revenue <- replace(df_revenue, is.na(df_revenue), 0)
# Confirm that no NA cells remain in either table
cat("Total of NULL in Spend:", sum(is.na(df_spend)))
## Total of NULL in Spend: 0
cat("\nTotal of NULL in Revenue:", sum(is.na(df_revenue)))
##
## Total of NULL in Revenue: 0
Using lubridate to filter the year of the date, also use the pivottabler library to group/aggregate it
Answer: online_video
# Extract the calendar year from a date column of a data frame.
#
# dataframe  - a data.frame (or tibble) containing `datecolumn`
# datecolumn - name of the Date column, given as a string
#
# Returns one year value per row. Uses [[ ]] rather than [, ] because on a
# tibble `dataframe[, col]` returns a one-column tibble (not a vector),
# which would break year(); [[ ]] always extracts the bare vector.
yearfunction <- function(dataframe, datecolumn) {
  lubridate::year(dataframe[[datecolumn]])
}
# Add a Year column so the pivot can split spend by year
df_spend$Year <- yearfunction(df_spend, "date")

# Keep only 2021 and 2022. The original piped through group_by(channel)
# first, but the grouping had no effect: the filter predicate is not
# grouped and as.data.frame() drops the grouping anyway, so it is removed.
df <- df_spend %>%
  filter(lubridate::year(date) %in% c(2021, 2022) )
df <- as.data.frame(df)

# Initialize the PIVOT
pt <- PivotTable$new()
# add the df with 2 years
pt$addData(df)
pt$addColumnDataGroups("Year")
pt$addRowDataGroups("channel")
pt$defineCalculation(calculationName="TotalSpend", summariseExpression="sum(spend)")
pt$evaluatePivot()

# Convert the pivot to a dataframe: column 1 = 2021 spend, column 2 = 2022
df1 <- pt$asDataFrame()

# Year-over-year lift per channel, in percent
df_summary_final <- df1 %>%
  mutate(index_lift = ( (df1[,2]/ df1[,1]) - 1 ) *100 )
df_summary_final<- df_summary_final[order(df_summary_final$index_lift, decreasing = TRUE), ]

# After sorting, the first row is the channel with the highest lift
paste("The channel with highest LIFT is:", row.names(df_summary_final)[1] )
## [1] "The channel with highest LIFT is: online_video"
Group by day to see the trend and plot
# Total daily revenue across the three sales channels. Bare column names
# are the idiomatic mutate() form; the original's df_revenue$... style
# re-reads the outer data frame and silently breaks if the piped input
# ever differs from it (same result here, but fragile).
df_revenue_total <- df_revenue %>%
  mutate(total_revenue = revenue_dtc + revenue_amazon + revenue_walmart)

# Group by sum using R Base aggregate() — one row per date
agg_df <- aggregate(df_revenue_total$total_revenue, by=list(df_revenue_total$date), FUN=sum)
colnames(agg_df) <- c('date','revenue')

# Plot daily revenue over the full period (Jan 2020 - Oct 2022)
ggplot(agg_df, aes(x=date, y = revenue)) + geom_line()
paste("Visually we can see some outliers")
## [1] "Visually we can see some outliers"
Now, we can use the anomalize library to identify the outlier values
Use time_decompose() to decompose a time series prior to performing anomaly detection with anomalize(). Typically, anomalize() is performed on the “remainder” of the time series decomposition.
The return has three columns: “remainder_l1” (lower limit for anomalies), “remainder_l2” (upper limit for anomalies), and “anomaly” (Yes/No).
# Convert df to a tibble (anomalize works on tibbles / tbl_time objects)
df <- as_tibble(agg_df)
class(df)
## [1] "tbl_df" "tbl" "data.frame"
# STL-decompose daily revenue, flag anomalies on the remainder with the
# IQR method (alpha = 0.05, at most 20% of points flagged), then rebuild
# the recomposed bands for plotting.
df_anomalized <- df %>%
time_decompose(revenue, method = "stl", merge = TRUE) %>%
anomalize(remainder, method = "iqr", alpha = 0.05, max_anoms = 0.2) %>%
time_recompose()
## Converting from tbl_df to tbl_time.
## Auto-index message: index = date
## frequency = 7 days
## trend = 91 days
#We can then visualize the anomalies using the plot_anomalies() function.
df_anomalized %>% plot_anomalies(ncol = 1, alpha_dots = 0.75)
We can print the details of the anomalies — for example, the last 4
# Show the last 4 rows that were flagged as anomalies
df_anomalized %>%
  filter(anomaly == 'Yes') %>%
  tail(4)
You can also use the timetk package for dynamic plot
# Interactive anomaly diagnostics from timetk on the same daily series
agg_df %>% timetk::plot_anomaly_diagnostics(date,revenue, .facet_ncol = 2)
## frequency = 7 observations per 1 week
## trend = 92 observations per 3 months
We can compute a month column before we aggregate and sort to find the most profitable month
Answer: the most profitable month of the year is month number 3 (March)
# Extract the calendar month (1-12) from a date column of a data frame.
# Mirrors yearfunction(); uses [[ ]] rather than [, ] so it also works on
# tibbles, where [, ] would return a one-column tibble instead of a vector.
monthfunction <- function(dataframe, datecolumn) {
  lubridate::month(dataframe[[datecolumn]])
}
# Derive the month number for every daily observation
agg_df$Month <- monthfunction(agg_df, "date")

# Total revenue per calendar month, summed across all years in the data
by_month <- aggregate(agg_df$revenue, by = list(agg_df$Month), FUN = sum)
colnames(by_month) <- c('month','revenue')
by_month <- by_month[order(by_month$revenue, decreasing = TRUE), ]

# Ranked table: highest-revenue month first
print(by_month)
## month revenue
## 3 3 58922686
## 1 1 57540788
## 2 2 57270462
## 4 4 51753505
## 5 5 44977988
## 12 12 41153803
## 10 10 33736751
## 11 11 30650581
## 6 6 29816422
## 9 9 23007186
## 7 7 21236448
## 8 8 20124698
# Fixed the typo in the message ("profitale" -> "profitable")
paste("Most profitable month in a year is the month number:", row.names(by_month)[1] )
## [1] "Most profitable month in a year is the month number: 3"
We need to merge both dataframes to build the time series of Revenue with AdSpend as a regressor. Convert the merged dataframe to a ts object, then plot the results
Visually we can confirm they have a similar pattern.
# Keep only the date and total_revenue columns of Acme
df_only_revenue <- df_revenue_total %>% select(date, total_revenue)
# Sum spend across all media channels, one row per day
df_spend_total <- aggregate(df_spend$spend, by = list(df_spend$date), FUN = sum)
colnames(df_spend_total) <- c('date','spend')
# Join revenue and spend on the shared date key
df_rev_spend <- merge(df_only_revenue, df_spend_total, by = "date")
# Build a two-column time series (revenue and spend) with ts()
acme_ts <- ts(data = df_rev_spend[, c("total_revenue", "spend")])
# Time plot of both variables, one facet each
autoplot(acme_ts, facets = TRUE)
Comment: https://getrecast.com/modern-media-mix-modeling/.